########################################################################################
# Script: Perpectief op Nederlanderschap: het effect van de snelheid van naturalisatie #
# Floris Peters, Maarten Vink & Hans Schmeets                                          #
# Bevolkingstrends                                                                     #
########################################################################################


################
# Introduction #
################

# This file provides the syntax used to create all the tables and figures in the paper. 
# The dataset contains sensitive, micro level information. As such, for privacy reasons the data is only available to individuals employed at or affiliated to Statistics Netherlands. 
# The dataset can be found at the following location on the network of Statistics Netherlands: \\cbsp.nl\Productie\Projecten\SAL\209253UM_FP_SEC1\Werk\Floris\PhD\BT_employment 

#######################################################################################################################
#######################################################################################################################

#############
# Variables #
#############

# ID
# (Individual identification number)
 
# EMPLOYMENT
# (Employment status)
# [0] Not employed; 
# [1] Employed

# NATURALIZATION
# (Possession of Dutch citizenship)
# [0] No; 
# [1] Yes

# NATURALIZATIONSPEED
# (Speed of naturalization)
# [0] No naturalization;
# [1] 1-3 years;
# [2] 4 years;
# [3] 5 years;
# [4] 6 years;
# [5] 7 years;
# [6] 8-10 years

# GENDER
# [1] Male; 
# [2] Female

# AGE
# (Age at the moment of migration)

# YSM
# (Years since migration)

# PARTNER
# [1] No partner; 
# [2] Foreign born foreign partner; 
# [3] Foreign born Dutch partner; 
# [4] Native Dutch partner

# CHILD
# (Children in the household in categories)
# [0] no children <18 in household; 
# [1] Children <18 in household

# HDI
# (Human Development Index (HDI) score origin country)

# HDI_CAT
# (Human Development Index (HDI) score origin country, in categories by median)
# [0] Low development; 
# [1] High development

# EU
# [0] Not EU country of origin; 
# [1] EU country of origin

# WESTERN
# (Western country of origin according to CBS definition)
# [0] No; 
# [1] Yes

#######################################################################################################################
#######################################################################################################################

#Upload the dataset and load the necessary libraries#
library(Matrix)
library(lme4)
library(optimx)
dataset_main <- read.csv(file.choose(),header=T,sep=";")


#############     
# Table 2.1 #
#############  

#descriptive statistics
prop.table(table(dataset_main$WESTERN,dataset_main$HDI_CAT),1)


#############     
# Table 3.1 #
############# 

#specify subsets
dataset_male <- subset(dataset_main, GENDER == 1)
dataset_female <- subset(dataset_main, GENDER == 2)
dataset_male_western <- subset(dataset_main, GENDER == 1 & WESTERN == 1)
dataset_female_western <- subset(dataset_main, GENDER == 2 & WESTERN == 1)
dataset_male_nonwestern <- subset(dataset_main, GENDER == 1 & WESTERN == 0)
dataset_female_nonwestern <- subset(dataset_main, GENDER == 2 & WESTERN == 0)
dataset_male_lowHDI <- subset(dataset_main, GENDER == 1 & HDI_CAT == 0)
dataset_female_lowHDI <- subset(dataset_main, GENDER == 2 & HDI_CAT == 0)
dataset_male_highHDI <- subset(dataset_main, GENDER == 1 & HDI_CAT == 1)
dataset_female_highHDI <- subset(dataset_main, GENDER == 2 & HDI_CAT == 1)

#descriptive statistics per subset
prop.table(table(dataset_main$GENDER,dataset_main$NATURALIZATIONSPEED),1)
prop.table(dataset_main$NATURALIZATION)
prop.table(table(dataset_male$WESTERN,dataset_male$NATURALIZATIONSPEED),1)
prop.table(table(dataset_male$WESTERN,dataset_male$NATURALIZATION),1)
prop.table(table(dataset_female$WESTERN,dataset_female$NATURALIZATIONSPEED),1)
prop.table(table(dataset_female$WESTERN,dataset_female$NATURALIZATION),1)
prop.table(table(dataset_male$HDI_CAT,dataset_male$NATURALIZATIONSPEED),1)
prop.table(table(dataset_male$HDI_CAT,dataset_male$NATURALIZATION),1)
prop.table(table(dataset_female$HDI_CAT,dataset_female$NATURALIZATIONSPEED),1)
prop.table(table(dataset_female$HDI_CAT,dataset_female$NATURALIZATION),1)

#sample sizes
length(dataset_male$ID)
length(unique(dataset_male$ID))
length(dataset_female$ID)
length(unique(dataset_female$ID))
length(dataset_male_western$ID)
length(unique(dataset_male_western$ID))
length(dataset_female_western$ID)
length(unique(dataset_female_western$ID))
length(dataset_male_nonwestern$ID)
length(unique(dataset_male_nonwestern$ID))
length(dataset_female_nonwestern$ID)
length(unique(dataset_female_nonwestern$ID))
length(dataset_male_lowHDI$ID)
length(unique(dataset_male_lowHDI$ID))
length(dataset_female_lowHDI$ID)
length(unique(dataset_female_lowHDI$ID))
length(dataset_male_highHDI$ID)
length(unique(dataset_male_highHDI$ID))
length(dataset_female_highHDI$ID)
length(unique(dataset_female_highHDI$ID))


#############     
# Table 3.2 #
############# 

#specify subsets
dataset_male <- subset(dataset_main, GENDER == 1)
dataset_female <- subset(dataset_main, GENDER == 2)

#create employment lags
dataset_male$EMPLOYMENT_LAG <- lag(dataset_male$EMPLOYMENT, n = 1L)
dataset_female$EMPLOYMENT_LAG <- lag(dataset_female$EMPLOYMENT, n = 1L)

#logistic regression analyses
result3.2_male <- glm(EMPLOYMENT ~ as.factor(NATURALIZATIONSPEED) + AGE + YSM + as.factor(PARTNER) + CHILD + EU + EMPLOYMENTLAG,
                      data = dataset_male, family = "binomial")
result3.2_female <- glm(EMPLOYMENT ~ as.factor(NATURALIZATIONSPEED) + AGE + YSM + as.factor(PARTNER) + CHILD + EU + EMPLOYMENTLAG,
                      data = dataset_female, family = "binomial")

#sample sizes
length(dataset_male$ID)
length(unique(dataset_male$ID))
length(dataset_female$ID)
length(unique(dataset_female$ID))


#############     
# Table 3.3 #
############# 

#specify subsets
dataset_male_western <- subset(dataset_main, GENDER == 1 & WESTERN == 1)
dataset_male_nonwestern <- subset(dataset_main, GENDER == 1 & WESTERN == 0)

#create employment lags
dataset_male_western$EMPLOYMENT_LAG <- lag(dataset_male_western$EMPLOYMENT, n = 1L)
dataset_male_western$EMPLOYMENT_LAG <- lag(dataset_male_western$EMPLOYMENT, n = 1L)

#logistic regression analyses
result3.3_male_western <- glm(EMPLOYMENT ~ as.factor(NATURALIZATIONSPEED) + AGE + YSM + as.factor(PARTNER) + CHILD + EMPLOYMENTLAG,
                      data = dataset_male_western, family = "binomial")
result3.3_male_non_western <- glm(EMPLOYMENT ~ as.factor(NATURALIZATIONSPEED) + AGE + YSM + as.factor(PARTNER) + CHILD + EMPLOYMENTLAG,
                      data = dataset_male_nonwestern, family = "binomial")

#sample sizes
length(dataset_male_western$ID)
length(unique(dataset_male_western$ID))
length(dataset_male_nonwestern$ID)
length(unique(dataset_male_nonwestern$ID))


#############     
# Table 3.4 #
############# 

#specify subsets
dataset_female_western <- subset(dataset_main, GENDER == 2 & WESTERN == 1)
dataset_female_nonwestern <- subset(dataset_main, GENDER == 2 & WESTERN == 0)

#create employment lags
dataset_female_western$EMPLOYMENT_LAG <- lag(dataset_female_western$EMPLOYMENT, n = 1L)
dataset_female_western$EMPLOYMENT_LAG <- lag(dataset_female_western$EMPLOYMENT, n = 1L)

#logistic regression analyses
result3.4_female_western <- glm(EMPLOYMENT ~ as.factor(NATURALIZATIONSPEED) + AGE + YSM + as.factor(PARTNER) + CHILD + EMPLOYMENTLAG,
                              data = dataset_female_western, family = "binomial")
result3.4_female_non_western <- glm(EMPLOYMENT ~ as.factor(NATURALIZATIONSPEED) + AGE + YSM + as.factor(PARTNER) + CHILD + EMPLOYMENTLAG,
                                  data = dataset_female_nonwestern, family = "binomial")

#sample sizes
length(dataset_female_western$ID)
length(unique(dataset_female_western$ID))
length(dataset_female_nonwestern$ID)
length(unique(dataset_female_nonwestern$ID))


#############     
# Table 3.5 #
############# 

#specify subsets
dataset_male_highHDI <- subset(dataset_main, GENDER == 1 & HDI_CAT == 1)
dataset_male_lowHDI <- subset(dataset_main, GENDER == 1 & HDI_CAT == 0)

#create employment lags
dataset_male_highHDI$EMPLOYMENT_LAG <- lag(dataset_male_highHDI$EMPLOYMENT, n = 1L)
dataset_male_lowHDI$EMPLOYMENT_LAG <- lag(dataset_male_lowHDI$EMPLOYMENT, n = 1L)

#logistic regression analyses
result3.5_male_highHDI <- glm(EMPLOYMENT ~ as.factor(NATURALIZATIONSPEED) + AGE + YSM + as.factor(PARTNER) + CHILD + EMPLOYMENTLAG,
                              data = dataset_male_highHDI, family = "binomial")
result3.5_male_lowHDI <- glm(EMPLOYMENT ~ as.factor(NATURALIZATIONSPEED) + AGE + YSM + as.factor(PARTNER) + CHILD + EMPLOYMENTLAG,
                                  data = dataset_male_lowHDI, family = "binomial")

#sample sizes
length(dataset_male_highHDI$ID)
length(unique(dataset_male_highHDI$ID))
length(dataset_male_lowHDI$ID)
length(unique(dataset_male_lowHDI$ID))


#############     
# Table 3.6 #
############# 

#specify subsets
dataset_female_highHDI <- subset(dataset_main, GENDER == 2 & HDI_CAT == 1)
dataset_female_lowHDI <- subset(dataset_main, GENDER == 2 & HDI_CAT == 0)

#create employment lags
dataset_female_highHDI$EMPLOYMENT_LAG <- lag(dataset_female_highHDI$EMPLOYMENT, n = 1L)
dataset_female_lowHDI$EMPLOYMENT_LAG <- lag(dataset_female_lowHDI$EMPLOYMENT, n = 1L)

#logistic regression analyses
result3.6_female_highHDI <- glm(EMPLOYMENT ~ as.factor(NATURALIZATIONSPEED) + AGE + YSM + as.factor(PARTNER) + CHILD + EMPLOYMENTLAG,
                              data = dataset_female_highHDI, family = "binomial")
result3.6_female_lowHDI <- glm(EMPLOYMENT ~ as.factor(NATURALIZATIONSPEED) + AGE + YSM + as.factor(PARTNER) + CHILD + EMPLOYMENTLAG,
                             data = dataset_female_lowHDI, family = "binomial")

#sample sizes
length(dataset_female_highHDI$ID)
length(unique(dataset_female_highHDI$ID))
length(dataset_female_lowHDI$ID)
length(unique(dataset_female_lowHDI$ID))
